********************************************************************************
*Clean HCRIS data

*Data processing steps based on code by Adam Sacarny, available here: https://github.com/asacarny/hospital-cost-reports/blob/master/import-source-cms.do
********************************************************************************
global SOURCE_BASE "\replication\public_data\hcris_data"
global STARTYEAR = 2010
global ENDYEAR = 2019
global STARTYEAR_HY = $STARTYEAR+1
global ENDYEAR_HY = $ENDYEAR-1

cd "$SOURCE_BASE"
capture mkdir output

********************************************************************************
*1. PROCESS FILES
*1.1 import the lookup table, which contains variable locations and types 
{
import excel using "lookup.xlsx", firstrow sheet("Lookup Table") clear
keep rec wksht_cd clmn_num line_num fmt enabled
isid wksht_cd clmn_num line_num fmt
tempfile lookup
save `lookup'

keep wksht_cd fmt
tab wk
gduplicates drop
tempfile worksheets
save `worksheets'

use rec enabled using `lookup'
collapse (max) enabled, by(rec) fast
tempfile enabled
save `enabled'

*var types and labels
import excel using "lookup.xlsx", firstrow sheet("Type and Label") clear
keep rec type label
drop if missing(rec)

merge 1:1 rec using `enabled', keep(master match) 
drop _m 

*locals with var types 
local type_dollar_flow
local type_flow
local type_stock
qui count
forvalues i=1/`r(N)' {
	if (enabled[`i']) {
		local cur_rec = rec[`i']
		local cur_type = type[`i']
		local L_`cur_rec' = label[`i']

		assert inlist("`cur_type'","dollar_flow","flow","stock")
		local type_`cur_type' = "`type_`cur_type'' `cur_rec'"
	}
}
clear
}

*1.2 process cost reports by year 
forvalues year = $STARTYEAR/$ENDYEAR {
	if (`year'>= 2011) {
		local formats "10"
	}
	else if (`year'==2010 | `year'==2011) {
		local formats "96 10"
	}
	else {
		local formats "96"
	}
	
	foreach fmt in `formats' {
		*(A) PROCESS NUMERIC FILE
		di "processing HOSP`fmt'_`year'_NMRC.dta"
		
		*keep relevant worksheets 
		use `worksheets', clear
		keep if fmt==`fmt'
		drop fmt
		sort wksht_cd
		merge 1:m wksht_cd using "HOSP`fmt'_`year'_NMRC.dta", keepusing(rpt_rec itm_val_num wksht_cd clmn_num line_num)
		keep if _m==3
		drop _m
		
		*collapse obs
		assert !missing(itm_val_num)		
		gen count = 1
		collapse (sum) itm_val_num count, by(rpt_rec_num wksht_cd clmn_num line_num) fast
		
		*label vars using locations
		gen fmt = `fmt'
		merge m:1 wksht_cd clmn_num line_num fmt using `lookup'
		keep if enabled==1 
		count if fmt==`fmt' & _m!=3
		assert r(N)==0 
		keep if _m==3
		keep rpt_rec_num rec itm_val_num

		*reshape to one row per report
		rename itm_val_num val_
		reshape wide val, i(rpt_rec_num) j(rec) string
		
		*rename vars 
		foreach var in `type_dollar_flow' `type_flow' `type_stock' {
			capture confirm variable val_`var'
			if (!_rc) {
				rename val_`var' `var'
			}
		}
		
		qui compress 
		tempfile nmrc
		save `nmrc', replace

		*(B) PROCESS INDEX FILE 
		use "HOSP`fmt'_`year'_RPT", clear
		keep rpt_rec_num prvdr_num rpt_stus_cd fy_bgn_dt fy_end_dt proc_dt
		merge 1:1 rpt_rec_num using `nmrc'
		tab _m
		drop if _m==2
		drop _m 

		gen year = `year'
		gen fmt = `fmt'		
		save "output\merged`year'_`fmt'", replace
	}
	
	*some years have multiple files because they are reported in 1996 and 2010 
	*formats; append these.
	clear
	foreach fmt in `formats' {
	if (_N==0) {
	use "output\merged`year'_`fmt'", clear
	}
	else {
	append using "output\merged`year'_`fmt'"
	}	
	}
	save "output\merged`year'", replace
	clear
}

*1.3 append years 
{
clear 
forvalues year=$STARTYEAR/$ENDYEAR {
	capture noisily append using "output\merged`year'"
}

isid rpt_rec_num year
rename prvdr_num pn
sort pn year fy_bgn_dt fmt
order rpt_rec_num pn year fy_bgn_dt fmt
label data "cms hospital cost report data"
label var fmt "report format (96=1996 10=2010)"
foreach var of varlist `type_dollar_flow' `type_flow' `type_stock' {
label var `var' "`L_`var''"
}
order rpt_rec_num pn year fmt fy_bgn_dt fy_end_dt rpt_stus_cd proc_dt `type_stock' `type_flow' `type_dollar_flow'
qui compress
sort pn
save "output\hcris_merged.dta", replace 
}

********************************************************************************
*2. CONSTRUCT HOSP YEAR SYNTHETIC FILE (CALENDARIZE)
cd "output"
	use "hcris_merged.dta", clear
	{
	*Note: my data contain no stock vars, so stock cleaning code is omitted
	keep rpt* pn year fmt fy* proc_dt s??_* g3_*
	local type_dollar_flow s32_* g3_*
	unab type_dollar_flow: `type_dollar_flow'
	
	*apportion each report to the years it spans
	{
	rename year hcris_year
	gen year_base     = year(fy_bgn_dt)
	gen years_spanned = year(fy_end_dt) - year(fy_bgn_dt) + 1
	isid rpt_rec_num hcris_year
	expand years_spanned

	egen seq = seq(), from(0) by(rpt_rec_num hcris_year)
	gen year = year_base+seq
	drop year_base seq

	*days of the target year covered by the cost report
	assert year(fy_bgn_dt)<=year & year(fy_end_dt)>=year
	gen first_day_in_year = max(mdy(1,1,year),fy_bgn_dt)
	format first_day_in_year %td
	gen last_day_in_year = min(mdy(12,31,year),fy_end_dt)
	format last_day_in_year %td

	gen days_in_year = last_day_in_year-first_day_in_year+1
	gen days_spanned = fy_end_dt-fy_bgn_dt+1

	*share of the report's days that fell into the target year
	gen frac_rpt_in_year = days_in_year/days_spanned
	egen totfrac = sum(frac_rpt_in_year), by(rpt_rec_num hcris_year)
	assert totfrac==1
	drop totfrac

	*share of the target year's days that were covered by the report
	gen double frac_year_covered = days_in_year/(mdy(12,31,year)-mdy(1,1,year)+1)
	gen nreports = 1
	gen nfmt96 = fmt==96
	gen nfmt10 = fmt==10
	tab fmt 
	}
	
*collapse flow variables to the hospital-year level
	*scale by the fraction of the year that the flow represents, then sum to total 365 days
	{	
	preserve
	foreach var in `type_flow' `type_dollar_flow' {
	
	*normalize to 1year's flow
	replace `var' = `var' / days_spanned * 365
	
	*weight by fraction of report that's in calyear
	gen wt = frac_year_covered if !missing(`var')
	bysort pn year: gegen wtsum = sum(wt)
	replace wt = wt / wtsum
	replace `var' = `var' * wt
	rename `var' `var'2
	bysort pn year: gegen `var' = sum(`var'2), missing
	drop wt wtsum `var'2
	}
	
	keep `type_flow' `type_dollar_flow' pn year 
	gduplicates drop
	gduplicates report pn year
	sort pn year
	tempfile collapsed_flows
	save `collapsed_flows', replace
	restore
	}
	
	*some variables should never be missing
	foreach var of varlist frac_year_covered nreports nfmt96 nfmt10 first_day_in_year last_day_in_year pn year {
	assert !missing(`var')
	}

	*collapse other vars to the hospital-year level
	collapse (sum) frac_year_covered nreports nfmt96 nfmt10 (min) covg_begin_dt=first_day_in_year (max) covg_end_dt=last_day_in_year, by(pn year)

	*merge in previously collapsed flow variables
	foreach collapsed_file in collapsed_flows {
	merge 1:1 pn year using ``collapsed_file'', assert(match) nogenerate
	}

	drop if year < $STARTYEAR_HY | year > $ENDYEAR_HY
	tab year
	
	*Inflate dollar flow vars to (2017)
	*CPI-U All Urban Consumers 
	scalar inf_11 = 1.0897 
	scalar inf_12 = 1.0676 
	scalar inf_13 = 1.0522
	scalar inf_14 = 1.0354 
	scalar inf_15 = 1.0342 
	scalar inf_16 = 1.0213 
	scalar inf_17 = 1.0000 
	scalar inf_18 = 0.9762 
	scalar inf_19 = 0.9588 
	foreach var in `type_dollar_flow' {
	forval i = 11/19 {
	qui replace `var' = `var'*inf_`i' if year==20`i'
	}	
	}

	label data "cms hospital cost report data (synthetic calendar year)"
	label var year "year"
	label var frac_year_covered "sum of days in reports / days in year"
	label var nreports "number of cost reports included in row"
	label var nfmt96 "number of 1996 format cost reports included in row"
	label var nfmt10 "number of 2010 format cost reports included in row"
	label var covg_begin_dt "first day in year with cost report coverage in row"
	label var covg_end_dt "last day in year with cost report coverage in row"
	order pn year, last
	capture order `type_flow' `type_dollar_flow', last
	qui compress
	sort pn ye
	}
	save "hcris_merged_hospyear.dta", replace